library(readr)
library(ggplot2)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggdark)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggpubr)
library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:plotly':
## 
##     slice
## The following object is masked from 'package:dplyr':
## 
##     slice
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
## 
##     select
## The following object is masked from 'package:dplyr':
## 
##     select
library(caret)
## Loading required package: lattice
library(corrplot)
## corrplot 0.92 loaded
library(ggExtra) # Load ggExtra
library(xgboost) # Load XGBoost
source("a_insights_shap_functions.r") # Load SHAP functions
library(Metrics) # Load metrics
## 
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
## 
##     precision, recall
library(pROC) # Load proc 
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:Metrics':
## 
##     auc
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# Read "day_approach_maskedID_timeseries.csv" from the working directory into
# `day`; readr guesses the column types and prints the column specification.
day <- readr::read_csv(file = "day_approach_maskedID_timeseries.csv")
## Rows: 42766 Columns: 73
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (73): nr. sessions, total km, km Z3-4, km Z5-T1-T2, km sprinting, streng...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read "week_approach_maskedID_timeseries.csv" from the working directory into
# `week`; readr guesses the column types and prints the column specification.
week <- readr::read_csv(file = "week_approach_maskedID_timeseries.csv")
## Rows: 42798 Columns: 72
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (72): nr. sessions, nr. rest days, total kms, max km one day, total km Z...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the rows whose `injury` value is present (not NA). Nothing else is
# validated here: non-missing values are retained whatever they are.
week <- week[!is.na(week[["injury"]]), ]

# Per-column summary statistics for the retained rows.
summary(week)
##   nr. sessions    nr. rest days     total kms      max km one day  
##  Min.   : 0.000   Min.   :0.000   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.: 5.000   1st Qu.:1.000   1st Qu.: 22.80   1st Qu.:  9.00  
##  Median : 6.000   Median :1.000   Median : 44.80   Median : 13.40  
##  Mean   : 5.809   Mean   :1.875   Mean   : 49.54   Mean   : 14.01  
##  3rd Qu.: 7.000   3rd Qu.:3.000   3rd Qu.: 70.10   3rd Qu.: 18.30  
##  Max.   :14.000   Max.   :7.000   Max.   :242.00   Max.   :131.00  
##  total km Z3-Z4-Z5-T1-T2 nr. tough sessions (effort in Z5, T1 or T2)
##  Min.   :  0.000         Min.   :0.0000                             
##  1st Qu.:  1.000         1st Qu.:0.0000                             
##  Median :  8.000         Median :1.0000                             
##  Mean   :  9.434         Mean   :0.9302                             
##  3rd Qu.: 14.600         3rd Qu.:2.0000                             
##  Max.   :100.000         Max.   :6.0000                             
##  nr. days with interval session total km Z3-4    max km Z3-4 one day
##  Min.   :0.000                  Min.   : 0.000   Min.   : 0.000     
##  1st Qu.:0.000                  1st Qu.: 0.000   1st Qu.: 0.000     
##  Median :2.000                  Median : 0.000   Median : 0.000     
##  Mean   :1.673                  Mean   : 4.859   Mean   : 3.457     
##  3rd Qu.:3.000                  3rd Qu.: 8.000   3rd Qu.: 6.300     
##  Max.   :7.000                  Max.   :79.800   Max.   :75.000     
##  total km Z5-T1-T2 max km Z5-T1-T2 one day total hours alternative training
##  Min.   : 0.000    Min.   : 0.000          Min.   : 0.000                  
##  1st Qu.: 0.000    1st Qu.: 0.000          1st Qu.: 0.000                  
##  Median : 1.500    Median : 1.200          Median : 0.000                  
##  Mean   : 4.064    Mean   : 2.725          Mean   : 1.149                  
##  3rd Qu.: 6.300    3rd Qu.: 5.000          3rd Qu.: 1.500                  
##  Max.   :80.000    Max.   :76.000          Max.   :52.500                  
##  nr. strength trainings  avg exertion     min exertion     max exertion   
##  Min.   :0.0000         Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000         1st Qu.:0.1400   1st Qu.:0.0900   1st Qu.:0.1600  
##  Median :1.0000         Median :0.3200   Median :0.1400   Median :0.5100  
##  Mean   :0.8156         Mean   :0.3199   Mean   :0.1887   Mean   :0.4706  
##  3rd Qu.:1.0000         3rd Qu.:0.4800   3rd Qu.:0.2700   3rd Qu.:0.7300  
##  Max.   :9.0000         Max.   :0.9800   Max.   :0.9800   Max.   :1.0000  
##  avg training success min training success max training success
##  Min.   :0.0000       Min.   :0.0000       Min.   :0.0000      
##  1st Qu.:0.0000       1st Qu.:0.0000       1st Qu.:0.0000      
##  Median :0.5900       Median :0.3700       Median :0.7300      
##  Mean   :0.4475       Mean   :0.3384       Mean   :0.5251      
##  3rd Qu.:0.7300       3rd Qu.:0.6100       3rd Qu.:0.8400      
##  Max.   :1.0000       Max.   :1.0000       Max.   :1.0000      
##   avg recovery     min recovery     max recovery    nr. sessions.1  
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.:0.1500   1st Qu.:0.1100   1st Qu.:0.1700   1st Qu.: 5.000  
##  Median :0.2200   Median :0.1600   Median :0.3100   Median : 6.000  
##  Mean   :0.2555   Mean   :0.1856   Mean   :0.3442   Mean   : 5.808  
##  3rd Qu.:0.3600   3rd Qu.:0.2500   3rd Qu.:0.5200   3rd Qu.: 7.000  
##  Max.   :0.9000   Max.   :0.9000   Max.   :1.0000   Max.   :14.000  
##  nr. rest days.1  total kms.1     max km one day.1 total km Z3-Z4-Z5-T1-T2.1
##  Min.   :0.000   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000          
##  1st Qu.:1.000   1st Qu.: 22.20   1st Qu.:  8.80   1st Qu.:  0.800          
##  Median :1.000   Median : 44.40   Median : 13.30   Median :  8.000          
##  Mean   :1.879   Mean   : 49.26   Mean   : 13.92   Mean   :  9.384          
##  3rd Qu.:3.000   3rd Qu.: 70.00   3rd Qu.: 18.30   3rd Qu.: 14.500          
##  Max.   :7.000   Max.   :235.00   Max.   :130.00   Max.   :106.200          
##  nr. tough sessions (effort in Z5, T1 or T2).1 nr. days with interval session.1
##  Min.   :0.0000                                Min.   :0.000                   
##  1st Qu.:0.0000                                1st Qu.:0.000                   
##  Median :1.0000                                Median :2.000                   
##  Mean   :0.9247                                Mean   :1.664                   
##  3rd Qu.:2.0000                                3rd Qu.:3.000                   
##  Max.   :6.0000                                Max.   :7.000                   
##  total km Z3-4.1  max km Z3-4 one day.1 total km Z5-T1-T2.1
##  Min.   : 0.000   Min.   : 0.000        Min.   : 0.000     
##  1st Qu.: 0.000   1st Qu.: 0.000        1st Qu.: 0.000     
##  Median : 0.000   Median : 0.000        Median : 1.400     
##  Mean   : 4.841   Mean   : 3.438        Mean   : 4.022     
##  3rd Qu.: 8.000   3rd Qu.: 6.300        3rd Qu.: 6.200     
##  Max.   :85.000   Max.   :75.000        Max.   :80.000     
##  max km Z5-T1-T2 one day.1 total hours alternative training.1
##  Min.   : 0.000            Min.   : 0.000                    
##  1st Qu.: 0.000            1st Qu.: 0.000                    
##  Median : 1.000            Median : 0.000                    
##  Mean   : 2.693            Mean   : 1.172                    
##  3rd Qu.: 5.000            3rd Qu.: 1.500                    
##  Max.   :76.000            Max.   :52.500                    
##  nr. strength trainings.1 avg exertion.1   min exertion.1   max exertion.1  
##  Min.   :0.0000           Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000           1st Qu.:0.1400   1st Qu.:0.0900   1st Qu.:0.1600  
##  Median :1.0000           Median :0.3200   Median :0.1400   Median :0.5100  
##  Mean   :0.8182           Mean   :0.3196   Mean   :0.1881   Mean   :0.4701  
##  3rd Qu.:1.0000           3rd Qu.:0.4800   3rd Qu.:0.2700   3rd Qu.:0.7300  
##  Max.   :9.0000           Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  avg training success.1 min training success.1 max training success.1
##  Min.   :0.0000         Min.   :0.0000         Min.   :0.0000        
##  1st Qu.:0.0000         1st Qu.:0.0000         1st Qu.:0.0000        
##  Median :0.5900         Median :0.3800         Median :0.7300        
##  Mean   :0.4481         Mean   :0.3398         Mean   :0.5254        
##  3rd Qu.:0.7300         3rd Qu.:0.6100         3rd Qu.:0.8400        
##  Max.   :1.0000         Max.   :1.0000         Max.   :1.0000        
##  avg recovery.1   min recovery.1   max recovery.1   nr. sessions.2  
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.:0.1500   1st Qu.:0.1100   1st Qu.:0.1700   1st Qu.: 5.000  
##  Median :0.2200   Median :0.1600   Median :0.3100   Median : 6.000  
##  Mean   :0.2551   Mean   :0.1851   Mean   :0.3436   Mean   : 5.811  
##  3rd Qu.:0.3600   3rd Qu.:0.2500   3rd Qu.:0.5200   3rd Qu.: 7.000  
##  Max.   :0.9000   Max.   :0.9000   Max.   :1.0000   Max.   :14.000  
##  nr. rest days.2  total kms.2     max km one day.2 total km Z3-Z4-Z5-T1-T2.2
##  Min.   :0.000   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000          
##  1st Qu.:1.000   1st Qu.: 21.70   1st Qu.:  8.60   1st Qu.:  0.200          
##  Median :1.000   Median : 43.90   Median : 13.20   Median :  7.900          
##  Mean   :1.884   Mean   : 48.81   Mean   : 13.82   Mean   :  9.297          
##  3rd Qu.:3.000   3rd Qu.: 69.60   3rd Qu.: 18.20   3rd Qu.: 14.500          
##  Max.   :7.000   Max.   :235.00   Max.   :135.00   Max.   :150.000          
##  nr. tough sessions (effort in Z5, T1 or T2).2 nr. days with interval session.2
##  Min.   :0.0000                                Min.   :0.000                   
##  1st Qu.:0.0000                                1st Qu.:0.000                   
##  Median :1.0000                                Median :2.000                   
##  Mean   :0.9153                                Mean   :1.653                   
##  3rd Qu.:2.0000                                3rd Qu.:3.000                   
##  Max.   :6.0000                                Max.   :7.000                   
##  total km Z3-4.2  max km Z3-4 one day.2 total km Z5-T1-T2.2
##  Min.   : 0.000   Min.   : 0.000        Min.   : 0.000     
##  1st Qu.: 0.000   1st Qu.: 0.000        1st Qu.: 0.000     
##  Median : 0.000   Median : 0.000        Median : 1.000     
##  Mean   : 4.793   Mean   : 3.399        Mean   : 3.977     
##  3rd Qu.: 8.000   3rd Qu.: 6.200        3rd Qu.: 6.100     
##  Max.   :85.000   Max.   :75.000        Max.   :52.200     
##  max km Z5-T1-T2 one day.2 total hours alternative training.2
##  Min.   : 0.000            Min.   : 0.000                    
##  1st Qu.: 0.000            1st Qu.: 0.000                    
##  Median : 1.000            Median : 0.000                    
##  Mean   : 2.665            Mean   : 1.204                    
##  3rd Qu.: 4.900            3rd Qu.: 1.580                    
##  Max.   :30.000            Max.   :67.330                    
##  nr. strength trainings.2 avg exertion.2 min exertion.2   max exertion.2
##  Min.   :0.0000           Min.   :0.00   Min.   :0.0000   Min.   :0.00  
##  1st Qu.:0.0000           1st Qu.:0.14   1st Qu.:0.0900   1st Qu.:0.16  
##  Median :1.0000           Median :0.32   Median :0.1400   Median :0.51  
##  Mean   :0.8244           Mean   :0.32   Mean   :0.1878   Mean   :0.47  
##  3rd Qu.:1.0000           3rd Qu.:0.49   3rd Qu.:0.2700   3rd Qu.:0.73  
##  Max.   :9.0000           Max.   :0.98   Max.   :0.9800   Max.   :1.00  
##  avg training success.2 min training success.2 max training success.2
##  Min.   :0.0000         Min.   :0.0000         Min.   :0.0000        
##  1st Qu.:0.0000         1st Qu.:0.0000         1st Qu.:0.0000        
##  Median :0.5900         Median :0.3800         Median :0.7300        
##  Mean   :0.4481         Mean   :0.3396         Mean   :0.5257        
##  3rd Qu.:0.7300         3rd Qu.:0.6100         3rd Qu.:0.8400        
##  Max.   :1.0000         Max.   :1.0000         Max.   :1.0000        
##  avg recovery.2   min recovery.2   max recovery.2     Athlete ID   
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   : 0.00  
##  1st Qu.:0.1500   1st Qu.:0.1100   1st Qu.:0.1700   1st Qu.:20.00  
##  Median :0.2200   Median :0.1600   Median :0.3100   Median :34.00  
##  Mean   :0.2551   Mean   :0.1849   Mean   :0.3435   Mean   :34.54  
##  3rd Qu.:0.3600   3rd Qu.:0.2400   3rd Qu.:0.5200   3rd Qu.:50.00  
##  Max.   :0.9000   Max.   :0.9000   Max.   :1.0000   Max.   :73.00  
##      injury        rel total kms week 0_1 rel total kms week 0_2
##  Min.   :0.00000   Min.   :        0      Min.   :        0     
##  1st Qu.:0.00000   1st Qu.:        1      1st Qu.:        1     
##  Median :0.00000   Median :        1      Median :        1     
##  Mean   :0.01344   Mean   :   440863      Mean   :   901468     
##  3rd Qu.:0.00000   3rd Qu.:        1      3rd Qu.:        1     
##  Max.   :1.00000   Max.   :209600000      Max.   :217600000     
##  rel total kms week 1_2      Date     
##  Min.   :        0      Min.   :   0  
##  1st Qu.:        1      1st Qu.: 437  
##  Median :        1      Median :1254  
##  Mean   :   480362      Mean   :1228  
##  3rd Qu.:        1      3rd Qu.:1913  
##  Max.   :209600000      Max.   :2673
# Rows of `week` whose `injury` value equals 0, followed by the per-column
# summary statistics of that subset.
noninjured <- week[week[["injury"]] == 0, ]
summary(noninjured)
##   nr. sessions    nr. rest days     total kms      max km one day 
##  Min.   : 0.000   Min.   :0.000   Min.   :  0.00   Min.   :  0.0  
##  1st Qu.: 5.000   1st Qu.:1.000   1st Qu.: 22.60   1st Qu.:  9.0  
##  Median : 6.000   Median :1.000   Median : 44.70   Median : 13.4  
##  Mean   : 5.801   Mean   :1.882   Mean   : 49.51   Mean   : 14.0  
##  3rd Qu.: 7.000   3rd Qu.:3.000   3rd Qu.: 70.10   3rd Qu.: 18.3  
##  Max.   :14.000   Max.   :7.000   Max.   :242.00   Max.   :131.0  
##  total km Z3-Z4-Z5-T1-T2 nr. tough sessions (effort in Z5, T1 or T2)
##  Min.   :  0.000         Min.   :0.0000                             
##  1st Qu.:  1.000         1st Qu.:0.0000                             
##  Median :  8.000         Median :1.0000                             
##  Mean   :  9.405         Mean   :0.9272                             
##  3rd Qu.: 14.500         3rd Qu.:2.0000                             
##  Max.   :100.000         Max.   :6.0000                             
##  nr. days with interval session total km Z3-4    max km Z3-4 one day
##  Min.   :0.000                  Min.   : 0.000   Min.   : 0.00      
##  1st Qu.:0.000                  1st Qu.: 0.000   1st Qu.: 0.00      
##  Median :2.000                  Median : 0.000   Median : 0.00      
##  Mean   :1.668                  Mean   : 4.849   Mean   : 3.45      
##  3rd Qu.:3.000                  3rd Qu.: 8.000   3rd Qu.: 6.30      
##  Max.   :7.000                  Max.   :79.800   Max.   :75.00      
##  total km Z5-T1-T2 max km Z5-T1-T2 one day total hours alternative training
##  Min.   : 0.000    Min.   : 0.000          Min.   : 0.000                  
##  1st Qu.: 0.000    1st Qu.: 0.000          1st Qu.: 0.000                  
##  Median : 1.500    Median : 1.000          Median : 0.000                  
##  Mean   : 4.049    Mean   : 2.717          Mean   : 1.149                  
##  3rd Qu.: 6.300    3rd Qu.: 5.000          3rd Qu.: 1.500                  
##  Max.   :80.000    Max.   :76.000          Max.   :52.500                  
##  nr. strength trainings  avg exertion     min exertion     max exertion   
##  Min.   :0.0000         Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000         1st Qu.:0.1400   1st Qu.:0.0900   1st Qu.:0.1600  
##  Median :1.0000         Median :0.3200   Median :0.1400   Median :0.5100  
##  Mean   :0.8123         Mean   :0.3188   Mean   :0.1882   Mean   :0.4688  
##  3rd Qu.:1.0000         3rd Qu.:0.4800   3rd Qu.:0.2700   3rd Qu.:0.7300  
##  Max.   :9.0000         Max.   :0.9800   Max.   :0.9800   Max.   :1.0000  
##  avg training success min training success max training success
##  Min.   :0.0000       Min.   :0.0000       Min.   :0.0000      
##  1st Qu.:0.0000       1st Qu.:0.0000       1st Qu.:0.0000      
##  Median :0.5900       Median :0.3700       Median :0.7300      
##  Mean   :0.4461       Mean   :0.3376       Mean   :0.5233      
##  3rd Qu.:0.7300       3rd Qu.:0.6100       3rd Qu.:0.8400      
##  Max.   :1.0000       Max.   :1.0000       Max.   :1.0000      
##   avg recovery     min recovery     max recovery    nr. sessions.1  
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.:0.1500   1st Qu.:0.1100   1st Qu.:0.1700   1st Qu.: 5.000  
##  Median :0.2200   Median :0.1600   Median :0.3100   Median : 6.000  
##  Mean   :0.2549   Mean   :0.1854   Mean   :0.3431   Mean   : 5.802  
##  3rd Qu.:0.3600   3rd Qu.:0.2500   3rd Qu.:0.5200   3rd Qu.: 7.000  
##  Max.   :0.9000   Max.   :0.9000   Max.   :1.0000   Max.   :14.000  
##  nr. rest days.1  total kms.1     max km one day.1 total km Z3-Z4-Z5-T1-T2.1
##  Min.   :0.000   Min.   :  0.00   Min.   : 0.00    Min.   :  0.000          
##  1st Qu.:1.000   1st Qu.: 22.10   1st Qu.: 8.80    1st Qu.:  0.800          
##  Median :1.000   Median : 44.40   Median :13.30    Median :  8.000          
##  Mean   :1.884   Mean   : 49.23   Mean   :13.91    Mean   :  9.367          
##  3rd Qu.:3.000   3rd Qu.: 70.00   3rd Qu.:18.30    3rd Qu.: 14.500          
##  Max.   :7.000   Max.   :235.00   Max.   :90.00    Max.   :106.200          
##  nr. tough sessions (effort in Z5, T1 or T2).1 nr. days with interval session.1
##  Min.   :0.0000                                Min.   :0.000                   
##  1st Qu.:0.0000                                1st Qu.:0.000                   
##  Median :1.0000                                Median :2.000                   
##  Mean   :0.9225                                Mean   :1.661                   
##  3rd Qu.:2.0000                                3rd Qu.:3.000                   
##  Max.   :6.0000                                Max.   :7.000                   
##  total km Z3-4.1  max km Z3-4 one day.1 total km Z5-T1-T2.1
##  Min.   : 0.000   Min.   : 0.000        Min.   : 0.000     
##  1st Qu.: 0.000   1st Qu.: 0.000        1st Qu.: 0.000     
##  Median : 0.000   Median : 0.000        Median : 1.200     
##  Mean   : 4.836   Mean   : 3.434        Mean   : 4.012     
##  3rd Qu.: 8.000   3rd Qu.: 6.300        3rd Qu.: 6.200     
##  Max.   :85.000   Max.   :75.000        Max.   :80.000     
##  max km Z5-T1-T2 one day.1 total hours alternative training.1
##  Min.   : 0.000            Min.   : 0.000                    
##  1st Qu.: 0.000            1st Qu.: 0.000                    
##  Median : 1.000            Median : 0.000                    
##  Mean   : 2.688            Mean   : 1.173                    
##  3rd Qu.: 5.000            3rd Qu.: 1.500                    
##  Max.   :76.000            Max.   :52.500                    
##  nr. strength trainings.1 avg exertion.1   min exertion.1   max exertion.1  
##  Min.   :0.0000           Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000           1st Qu.:0.1400   1st Qu.:0.0900   1st Qu.:0.1600  
##  Median :1.0000           Median :0.3200   Median :0.1400   Median :0.5100  
##  Mean   :0.8148           Mean   :0.3186   Mean   :0.1876   Mean   :0.4685  
##  3rd Qu.:1.0000           3rd Qu.:0.4800   3rd Qu.:0.2700   3rd Qu.:0.7300  
##  Max.   :9.0000           Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  avg training success.1 min training success.1 max training success.1
##  Min.   :0.0000         Min.   :0.0000         Min.   :0.0000        
##  1st Qu.:0.0000         1st Qu.:0.0000         1st Qu.:0.0000        
##  Median :0.5900         Median :0.3800         Median :0.7300        
##  Mean   :0.4468         Mean   :0.3389         Mean   :0.5237        
##  3rd Qu.:0.7300         3rd Qu.:0.6100         3rd Qu.:0.8400        
##  Max.   :1.0000         Max.   :1.0000         Max.   :1.0000        
##  avg recovery.1   min recovery.1   max recovery.1   nr. sessions.2  
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.:0.1500   1st Qu.:0.1100   1st Qu.:0.1700   1st Qu.: 5.000  
##  Median :0.2200   Median :0.1600   Median :0.3100   Median : 6.000  
##  Mean   :0.2546   Mean   :0.1849   Mean   :0.3427   Mean   : 5.807  
##  3rd Qu.:0.3600   3rd Qu.:0.2400   3rd Qu.:0.5100   3rd Qu.: 7.000  
##  Max.   :0.9000   Max.   :0.9000   Max.   :1.0000   Max.   :14.000  
##  nr. rest days.2  total kms.2     max km one day.2 total km Z3-Z4-Z5-T1-T2.2
##  Min.   :0.000   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000          
##  1st Qu.:1.000   1st Qu.: 21.60   1st Qu.:  8.60   1st Qu.:  0.000          
##  Median :1.000   Median : 43.90   Median : 13.20   Median :  7.900          
##  Mean   :1.887   Mean   : 48.82   Mean   : 13.81   Mean   :  9.291          
##  3rd Qu.:3.000   3rd Qu.: 69.70   3rd Qu.: 18.20   3rd Qu.: 14.500          
##  Max.   :7.000   Max.   :235.00   Max.   :135.00   Max.   :150.000          
##  nr. tough sessions (effort in Z5, T1 or T2).2 nr. days with interval session.2
##  Min.   :0.0000                                Min.   :0.000                   
##  1st Qu.:0.0000                                1st Qu.:0.000                   
##  Median :1.0000                                Median :2.000                   
##  Mean   :0.9143                                Mean   :1.652                   
##  3rd Qu.:2.0000                                3rd Qu.:3.000                   
##  Max.   :6.0000                                Max.   :7.000                   
##  total km Z3-4.2  max km Z3-4 one day.2 total km Z5-T1-T2.2
##  Min.   : 0.000   Min.   : 0.000        Min.   : 0.000     
##  1st Qu.: 0.000   1st Qu.: 0.000        1st Qu.: 0.000     
##  Median : 0.000   Median : 0.000        Median : 1.000     
##  Mean   : 4.793   Mean   : 3.398        Mean   : 3.973     
##  3rd Qu.: 8.000   3rd Qu.: 6.200        3rd Qu.: 6.100     
##  Max.   :85.000   Max.   :75.000        Max.   :52.200     
##  max km Z5-T1-T2 one day.2 total hours alternative training.2
##  Min.   : 0.000            Min.   : 0.000                    
##  1st Qu.: 0.000            1st Qu.: 0.000                    
##  Median : 1.000            Median : 0.000                    
##  Mean   : 2.662            Mean   : 1.204                    
##  3rd Qu.: 4.900            3rd Qu.: 1.580                    
##  Max.   :30.000            Max.   :67.330                    
##  nr. strength trainings.2 avg exertion.2   min exertion.2   max exertion.2  
##  Min.   :0.0000           Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000           1st Qu.:0.1400   1st Qu.:0.0900   1st Qu.:0.1600  
##  Median :1.0000           Median :0.3200   Median :0.1400   Median :0.5100  
##  Mean   :0.8223           Mean   :0.3191   Mean   :0.1873   Mean   :0.4686  
##  3rd Qu.:1.0000           3rd Qu.:0.4800   3rd Qu.:0.2700   3rd Qu.:0.7300  
##  Max.   :9.0000           Max.   :0.9800   Max.   :0.9800   Max.   :1.0000  
##  avg training success.2 min training success.2 max training success.2
##  Min.   :0.0000         Min.   :0.0000         Min.   :0.0000        
##  1st Qu.:0.0000         1st Qu.:0.0000         1st Qu.:0.0000        
##  Median :0.5900         Median :0.3700         Median :0.7300        
##  Mean   :0.4469         Mean   :0.3388         Mean   :0.5241        
##  3rd Qu.:0.7300         3rd Qu.:0.6100         3rd Qu.:0.8400        
##  Max.   :1.0000         Max.   :1.0000         Max.   :1.0000        
##  avg recovery.2   min recovery.2   max recovery.2     Athlete ID        injury 
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   : 0.00   Min.   :0  
##  1st Qu.:0.1500   1st Qu.:0.1100   1st Qu.:0.1700   1st Qu.:20.00   1st Qu.:0  
##  Median :0.2200   Median :0.1600   Median :0.3100   Median :34.00   Median :0  
##  Mean   :0.2546   Mean   :0.1847   Mean   :0.3426   Mean   :34.53   Mean   :0  
##  3rd Qu.:0.3600   3rd Qu.:0.2400   3rd Qu.:0.5100   3rd Qu.:50.00   3rd Qu.:0  
##  Max.   :0.9000   Max.   :0.9000   Max.   :1.0000   Max.   :73.00   Max.   :0  
##  rel total kms week 0_1 rel total kms week 0_2 rel total kms week 1_2
##  Min.   :        0      Min.   :        0      Min.   :        0     
##  1st Qu.:        1      1st Qu.:        1      1st Qu.:        1     
##  Median :        1      Median :        1      Median :        1     
##  Mean   :   442719      Mean   :   899340      Mean   :   480452     
##  3rd Qu.:        1      3rd Qu.:        1      3rd Qu.:        1     
##  Max.   :209600000      Max.   :217600000      Max.   :209600000     
##       Date     
##  Min.   :   0  
##  1st Qu.: 429  
##  Median :1251  
##  Mean   :1225  
##  3rd Qu.:1909  
##  Max.   :2652
# Rows of `week` whose `injury` value equals 1, followed by the per-column
# summary statistics of that subset.
injured <- week[week[["injury"]] == 1, ]
summary(injured)
##   nr. sessions    nr. rest days     total kms      max km one day 
##  Min.   : 0.000   Min.   :0.000   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 5.000   1st Qu.:0.000   1st Qu.: 30.70   1st Qu.:11.00  
##  Median : 6.000   Median :1.000   Median : 49.00   Median :14.30  
##  Mean   : 6.435   Mean   :1.353   Mean   : 51.86   Mean   :14.77  
##  3rd Qu.: 8.000   3rd Qu.:2.000   3rd Qu.: 69.00   3rd Qu.:18.00  
##  Max.   :13.000   Max.   :7.000   Max.   :174.00   Max.   :57.00  
##  total km Z3-Z4-Z5-T1-T2 nr. tough sessions (effort in Z5, T1 or T2)
##  Min.   : 0.00           Min.   :0.000                              
##  1st Qu.: 5.90           1st Qu.:0.000                              
##  Median :10.90           Median :1.000                              
##  Mean   :11.55           Mean   :1.146                              
##  3rd Qu.:16.50           3rd Qu.:2.000                              
##  Max.   :45.50           Max.   :4.000                              
##  nr. days with interval session total km Z3-4    max km Z3-4 one day
##  Min.   :0.000                  Min.   : 0.000   Min.   : 0.000     
##  1st Qu.:1.000                  1st Qu.: 0.000   1st Qu.: 0.000     
##  Median :2.000                  Median : 3.800   Median : 3.500     
##  Mean   :1.991                  Mean   : 5.593   Mean   : 3.939     
##  3rd Qu.:3.000                  3rd Qu.: 9.000   3rd Qu.: 7.000     
##  Max.   :6.000                  Max.   :35.400   Max.   :21.100     
##  total km Z5-T1-T2 max km Z5-T1-T2 one day total hours alternative training
##  Min.   : 0.000    Min.   : 0.000          Min.   : 0.000                  
##  1st Qu.: 0.000    1st Qu.: 0.000          1st Qu.: 0.000                  
##  Median : 4.000    Median : 3.600          Median : 0.000                  
##  Mean   : 5.179    Mean   : 3.358          Mean   : 1.142                  
##  3rd Qu.: 8.150    3rd Qu.: 6.000          3rd Qu.: 1.500                  
##  Max.   :32.300    Max.   :15.000          Max.   :27.420                  
##  nr. strength trainings  avg exertion     min exertion     max exertion   
##  Min.   :0.000          Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.000          1st Qu.:0.2400   1st Qu.:0.1000   1st Qu.:0.4100  
##  Median :1.000          Median :0.4300   Median :0.1800   Median :0.6900  
##  Mean   :1.061          Mean   :0.4043   Mean   :0.2213   Mean   :0.6029  
##  3rd Qu.:2.000          3rd Qu.:0.5600   3rd Qu.:0.3100   3rd Qu.:0.8050  
##  Max.   :5.000          Max.   :0.8500   Max.   :0.7000   Max.   :1.0000  
##  avg training success min training success max training success
##  Min.   :0.0000       Min.   :0.0000       Min.   :0.0000      
##  1st Qu.:0.5100       1st Qu.:0.1450       1st Qu.:0.6500      
##  Median :0.6600       Median :0.4600       Median :0.7800      
##  Mean   :0.5507       Mean   :0.3977       Mean   :0.6539      
##  3rd Qu.:0.7400       3rd Qu.:0.6200       3rd Qu.:0.8700      
##  Max.   :0.9800       Max.   :0.9600       Max.   :1.0000      
##   avg recovery     min recovery     max recovery    nr. sessions.1  
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.:0.1800   1st Qu.:0.1200   1st Qu.:0.2150   1st Qu.: 5.000  
##  Median :0.2800   Median :0.1800   Median :0.4100   Median : 6.000  
##  Mean   :0.2997   Mean   :0.2031   Mean   :0.4254   Mean   : 6.294  
##  3rd Qu.:0.4000   3rd Qu.:0.2700   3rd Qu.:0.5900   3rd Qu.: 8.000  
##  Max.   :0.8600   Max.   :0.7000   Max.   :1.0000   Max.   :14.000  
##  nr. rest days.1  total kms.1     max km one day.1 total km Z3-Z4-Z5-T1-T2.1
##  Min.   :0.000   Min.   :  0.00   Min.   :  0.00   Min.   : 0.00            
##  1st Qu.:1.000   1st Qu.: 30.55   1st Qu.: 10.50   1st Qu.: 4.80            
##  Median :1.000   Median : 47.30   Median : 14.00   Median :10.00            
##  Mean   :1.482   Mean   : 51.19   Mean   : 14.73   Mean   :10.65            
##  3rd Qu.:2.000   3rd Qu.: 66.85   3rd Qu.: 18.00   3rd Qu.:15.50            
##  Max.   :7.000   Max.   :202.00   Max.   :130.00   Max.   :54.60            
##  nr. tough sessions (effort in Z5, T1 or T2).1 nr. days with interval session.1
##  Min.   :0.000                                 Min.   :0.000                   
##  1st Qu.:0.000                                 1st Qu.:1.000                   
##  Median :1.000                                 Median :2.000                   
##  Mean   :1.087                                 Mean   :1.875                   
##  3rd Qu.:2.000                                 3rd Qu.:3.000                   
##  Max.   :4.000                                 Max.   :5.000                   
##  total km Z3-4.1 max km Z3-4 one day.1 total km Z5-T1-T2.1
##  Min.   : 0.00   Min.   : 0.000        Min.   : 0.000     
##  1st Qu.: 0.00   1st Qu.: 0.000        1st Qu.: 0.000     
##  Median : 2.50   Median : 2.400        Median : 3.200     
##  Mean   : 5.22   Mean   : 3.705        Mean   : 4.732     
##  3rd Qu.: 8.50   3rd Qu.: 6.500        3rd Qu.: 7.850     
##  Max.   :34.70   Max.   :21.100        Max.   :42.100     
##  max km Z5-T1-T2 one day.1 total hours alternative training.1
##  Min.   : 0.000            Min.   : 0.000                    
##  1st Qu.: 0.000            1st Qu.: 0.000                    
##  Median : 3.000            Median : 0.000                    
##  Mean   : 3.075            Mean   : 1.057                    
##  3rd Qu.: 5.600            3rd Qu.: 1.500                    
##  Max.   :21.100            Max.   :12.000                    
##  nr. strength trainings.1 avg exertion.1   min exertion.1   max exertion.1  
##  Min.   :0.000            Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.000            1st Qu.:0.2150   1st Qu.:0.1100   1st Qu.:0.3400  
##  Median :1.000            Median :0.4200   Median :0.1800   Median :0.6800  
##  Mean   :1.071            Mean   :0.3946   Mean   :0.2211   Mean   :0.5871  
##  3rd Qu.:2.000            3rd Qu.:0.5400   3rd Qu.:0.3100   3rd Qu.:0.8000  
##  Max.   :5.000            Max.   :0.9100   Max.   :0.8800   Max.   :1.0000  
##  avg training success.1 min training success.1 max training success.1
##  Min.   :0.0000         Min.   :0.0000         Min.   :0.0000        
##  1st Qu.:0.4800         1st Qu.:0.1350         1st Qu.:0.6350        
##  Median :0.6500         Median :0.4600         Median :0.7800        
##  Mean   :0.5454         Mean   :0.4016         Mean   :0.6457        
##  3rd Qu.:0.7400         3rd Qu.:0.6200         3rd Qu.:0.8600        
##  Max.   :1.0000         Max.   :1.0000         Max.   :1.0000        
##  avg recovery.1   min recovery.1   max recovery.1   nr. sessions.2 
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   : 0.00  
##  1st Qu.:0.1700   1st Qu.:0.1300   1st Qu.:0.2000   1st Qu.: 5.00  
##  Median :0.2800   Median :0.1800   Median :0.3800   Median : 6.00  
##  Mean   :0.2953   Mean   :0.2029   Mean   :0.4123   Mean   : 6.07  
##  3rd Qu.:0.4000   3rd Qu.:0.2700   3rd Qu.:0.5900   3rd Qu.: 7.00  
##  Max.   :0.7300   Max.   :0.6200   Max.   :1.0000   Max.   :14.00  
##  nr. rest days.2  total kms.2     max km one day.2 total km Z3-Z4-Z5-T1-T2.2
##  Min.   :0.000   Min.   :  0.00   Min.   : 0.00    Min.   : 0.000           
##  1st Qu.:1.000   1st Qu.: 26.90   1st Qu.: 9.75    1st Qu.: 3.000           
##  Median :1.000   Median : 42.50   Median :13.40    Median : 9.000           
##  Mean   :1.652   Mean   : 47.69   Mean   :13.91    Mean   : 9.737           
##  3rd Qu.:2.000   3rd Qu.: 66.80   3rd Qu.:17.75    3rd Qu.:14.750           
##  Max.   :7.000   Max.   :191.00   Max.   :52.90    Max.   :59.800           
##  nr. tough sessions (effort in Z5, T1 or T2).2 nr. days with interval session.2
##  Min.   :0.0000                                Min.   :0.000                   
##  1st Qu.:0.0000                                1st Qu.:1.000                   
##  Median :1.0000                                Median :2.000                   
##  Mean   :0.9913                                Mean   :1.729                   
##  3rd Qu.:2.0000                                3rd Qu.:3.000                   
##  Max.   :4.0000                                Max.   :5.000                   
##  total km Z3-4.2  max km Z3-4 one day.2 total km Z5-T1-T2.2
##  Min.   : 0.000   Min.   : 0.00         Min.   : 0.000     
##  1st Qu.: 0.000   1st Qu.: 0.00         1st Qu.: 0.000     
##  Median : 0.000   Median : 0.00         Median : 3.000     
##  Mean   : 4.818   Mean   : 3.48         Mean   : 4.325     
##  3rd Qu.: 8.000   3rd Qu.: 6.50         3rd Qu.: 6.550     
##  Max.   :39.000   Max.   :30.00         Max.   :30.000     
##  max km Z5-T1-T2 one day.2 total hours alternative training.2
##  Min.   : 0.000            Min.   : 0.000                    
##  1st Qu.: 0.000            1st Qu.: 0.000                    
##  Median : 2.500            Median : 0.000                    
##  Mean   : 2.881            Mean   : 1.212                    
##  3rd Qu.: 5.000            3rd Qu.: 1.615                    
##  Max.   :20.000            Max.   :21.780                    
##  nr. strength trainings.2 avg exertion.2   min exertion.2   max exertion.2  
##  Min.   :0.0000           Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000           1st Qu.:0.1800   1st Qu.:0.1000   1st Qu.:0.2800  
##  Median :1.0000           Median :0.4200   Median :0.1900   Median :0.6800  
##  Mean   :0.9791           Mean   :0.3873   Mean   :0.2203   Mean   :0.5748  
##  3rd Qu.:2.0000           3rd Qu.:0.5500   3rd Qu.:0.3200   3rd Qu.:0.7900  
##  Max.   :6.0000           Max.   :0.7800   Max.   :0.7300   Max.   :1.0000  
##  avg training success.2 min training success.2 max training success.2
##  Min.   :0.000          Min.   :0.000          Min.   :0.0000        
##  1st Qu.:0.480          1st Qu.:0.115          1st Qu.:0.6300        
##  Median :0.650          Median :0.470          Median :0.7900        
##  Mean   :0.541          Mean   :0.397          Mean   :0.6412        
##  3rd Qu.:0.740          3rd Qu.:0.620          3rd Qu.:0.8700        
##  Max.   :1.000          Max.   :1.000          Max.   :1.0000        
##  avg recovery.2   min recovery.2 max recovery.2     Athlete ID        injury 
##  Min.   :0.0000   Min.   :0.00   Min.   :0.0000   Min.   : 0.00   Min.   :1  
##  1st Qu.:0.1700   1st Qu.:0.12   1st Qu.:0.1950   1st Qu.:23.00   1st Qu.:1  
##  Median :0.2800   Median :0.17   Median :0.4000   Median :36.00   Median :1  
##  Mean   :0.2916   Mean   :0.20   Mean   :0.4076   Mean   :35.42   Mean   :1  
##  3rd Qu.:0.4000   3rd Qu.:0.27   3rd Qu.:0.5700   3rd Qu.:50.00   3rd Qu.:1  
##  Max.   :0.7500   Max.   :0.69   Max.   :1.0000   Max.   :71.00   Max.   :1  
##  rel total kms week 0_1 rel total kms week 0_2 rel total kms week 1_2
##  Min.   :       0       Min.   :        0      Min.   :       0      
##  1st Qu.:       1       1st Qu.:        1      1st Qu.:       1      
##  Median :       1       Median :        1      Median :       1      
##  Mean   :  304523       Mean   :  1057740      Mean   :  473740      
##  3rd Qu.:       1       3rd Qu.:        2      3rd Qu.:       1      
##  Max.   :47000000       Max.   :100200000      Max.   :53100000      
##       Date       
##  Min.   : 246.0  
##  1st Qu.: 765.5  
##  Median :1485.0  
##  Mean   :1431.6  
##  3rd Qu.:2075.5  
##  Max.   :2673.0
# Turn the injury flag of each subset into a factor and give its level a
# readable label.
injured$injury <- as.factor(injured$injury)
levels(injured$injury) <- "Injured"

noninjured$injury <- as.factor(noninjured$injury)
levels(noninjured$injury) <- "NotInjured"
# Count rows per athlete and injury flag, then spread the flag values into
# one column each (athletes lacking a flag value get a count of 0).
# `.groups = "drop_last"` states the grouping that summarise() previously
# chose implicitly (and announced with a message); pivot_wider() replaces the
# superseded spread(), with names_sort = TRUE so the columns are ordered by
# flag value.
injury_dist <- week %>%
  group_by(`Athlete ID`, injury) %>%
  summarise(count = n(), .groups = "drop_last") %>%
  pivot_wider(names_from = injury, values_from = count,
              values_fill = 0, names_sort = TRUE)
## `summarise()` has grouped output by 'Athlete ID'. You can override using the
## `.groups` argument.
injury_dist
## # A tibble: 74 × 3
## # Groups:   Athlete ID [74]
##    `Athlete ID`   `0`   `1`
##           <dbl> <dbl> <dbl>
##  1            0   303     7
##  2            1   459     2
##  3            2  1287     4
##  4            3   341     3
##  5            4   674     7
##  6            5   201     1
##  7            6   530     4
##  8            7   299     5
##  9            8   261     7
## 10            9  1210    22
## # … with 64 more rows
# Give the per-athlete count table readable column names (set by position).
names(injury_dist) <- c("Athlete ID", "Not Injured", "Injured")

#p <- ggplot(injury_dist, aes(x=`Athlete ID`, fill=Injured))+ # Set dataset and aesthetics
#  geom_density(alpha = 0.3, fill='red3') 
#p
# Keep only the rows flagged as injured and attach, to every row, the number
# of such rows belonging to the same athlete (Freq).
injury_sum <- week[which(week$injury == 1), ] %>%
  group_by(`Athlete ID`, injury) %>%
  mutate(Freq = n())
injury_sum
## # A tibble: 575 × 73
## # Groups:   Athlete ID, injury [61]
##    nr. session…¹ nr. r…² total…³ max k…⁴ total…⁵ nr. t…⁶ nr. d…⁷ total…⁸ max k…⁹
##            <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1             6       1    34.9    14.5    10.4       0       2     9.5     5  
##  2             6       1    33.6    15.2    15.3       1       2    12.4     6.4
##  3             1       6     7       7       0         0       0     0       0  
##  4             7       0    47.2    12.5    14.7       0       3    14       6  
##  5             9       1    29.4     7.5     3.9       3       3     0       0  
##  6             7       0    28.5     6.9    10.8       1       4    10       4.5
##  7             3       4    23.4    12       1.9       1       2     1.5     1.5
##  8             6       1   117      22      20         0       2    20      11  
##  9             6       1   114      30      10         1       1     0       0  
## 10             6       2    78      24      15         1       2     9       9  
## # … with 565 more rows, 64 more variables: `total km Z5-T1-T2` <dbl>,
## #   `max km Z5-T1-T2 one day` <dbl>, `total hours alternative training` <dbl>,
## #   `nr. strength trainings` <dbl>, `avg exertion` <dbl>, `min exertion` <dbl>,
## #   `max exertion` <dbl>, `avg training success` <dbl>,
## #   `min training success` <dbl>, `max training success` <dbl>,
## #   `avg recovery` <dbl>, `min recovery` <dbl>, `max recovery` <dbl>,
## #   `nr. sessions.1` <dbl>, `nr. rest days.1` <dbl>, `total kms.1` <dbl>, …
# Bar chart of injured rows per athlete, shaded by that athlete's Freq.
# Freq is constant within an athlete, so grouping by athlete lets the count
# stat keep the fill value instead of dropping it with a warning.
ggplot(injury_sum,
       aes(x = `Athlete ID`, fill = Freq, group = `Athlete ID`)) +
  geom_bar()
## Warning: The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

# Create plot
# Density of max exertion over the injured rows held in injury_sum.
# The fill is a fixed colour, so no fill mapping is needed. The y axis of
# geom_density() is a density, not an injury count.
p <- ggplot(injury_sum, aes(x = `max exertion`)) + # Set dataset and aesthetics
  geom_density(alpha = 0.3, fill = "red3") +
  labs(x = "Max Exertion", y = "Density", # Set labels for plot
       title = "Injuries by Max Exertion") +
  # A complete theme replaces every earlier theme() setting, so it has to be
  # added first and the tweaks applied on top of it.
  dark_theme_bw() + # Turn theme to dark mode
  theme(axis.line = element_line(colour = "white"), # Axis line visible on dark background
        panel.grid.major = element_blank(), # Remove grid
        panel.grid.minor = element_blank(), # Remove grid
        panel.border = element_blank(), # Remove border
        panel.background = element_blank()) # Remove panel background
## Inverted geom defaults of fill and color/colour.
## To change them back, use invert_geom_defaults().
# Density of average recovery over the injured rows held in injury_sum.
# The fill is a fixed colour, so no fill mapping is needed. The y axis of
# geom_density() is a density, not an injury count.
p1 <- ggplot(injury_sum, aes(x = `avg recovery`)) + # Set dataset and aesthetics
  geom_density(alpha = 0.3, fill = "blue2") +
  labs(x = "Average Recovery", y = "Density", # Set labels for plot
       title = "Injuries by Average Recovery") +
  # A complete theme replaces every earlier theme() setting, so it has to be
  # added first and the tweaks applied on top of it.
  dark_theme_bw() + # Turn theme to dark mode
  theme(axis.line = element_line(colour = "white"), # Axis line visible on dark background
        panel.grid.major = element_blank(), # Remove grid
        panel.grid.minor = element_blank(), # Remove grid
        panel.border = element_blank(), # Remove border
        panel.background = element_blank()) # Remove panel background

# Place both density plots side by side with equal widths.
p <- ggarrange(p, p1, ncol = 2, widths = c(3, 3),
               common.legend = TRUE, legend = "bottom")
print(p)

For weeks in which an athlete was injured, this was their average exertion:

# Create a ggplot boxplot
# injury is stored as a number; converting it to a factor gives one box per
# injury status and keeps the fill mapping (a continuous x produces a single
# box plus "Continuous x aesthetic" / dropped-fill warnings).
p <- ggplot(week, aes(x = factor(injury), y = `avg exertion`,
                      fill = factor(injury))) +
  geom_boxplot() +
  labs(x = "injury", fill = "injury")
p
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
## Warning: The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

# Convert the boxplot held in `p` to an interactive plotly widget and set its
# title and axis titles.
plotly::layout(
  ggplotly(p),
  title = "Boxplot of Average Exertion by Injury Status",
  xaxis = list(title = "Injury Status"),
  yaxis = list(title = "Average Exertion")
)
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
## The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
# Density of the Date column, one curve per injury status.
# injury is numeric, so it is wrapped in factor() to create the two groups;
# mapped as a number the fill is dropped and the manual colours never apply.
p <- week %>%
  ggplot(aes(x = Date)) +
  geom_density(aes(fill = factor(injury)), alpha = 0.3) +
  scale_fill_manual(values = c("0" = "red2", "1" = "green")) +
  labs(title = "Density of Date by Injury Status",
       x = "Date", y = "Density", fill = "Injury Status")
p
## Warning: The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

summary(week)
##   nr. sessions    nr. rest days     total kms      max km one day  
##  Min.   : 0.000   Min.   :0.000   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.: 5.000   1st Qu.:1.000   1st Qu.: 22.80   1st Qu.:  9.00  
##  Median : 6.000   Median :1.000   Median : 44.80   Median : 13.40  
##  Mean   : 5.809   Mean   :1.875   Mean   : 49.54   Mean   : 14.01  
##  3rd Qu.: 7.000   3rd Qu.:3.000   3rd Qu.: 70.10   3rd Qu.: 18.30  
##  Max.   :14.000   Max.   :7.000   Max.   :242.00   Max.   :131.00  
##  total km Z3-Z4-Z5-T1-T2 nr. tough sessions (effort in Z5, T1 or T2)
##  Min.   :  0.000         Min.   :0.0000                             
##  1st Qu.:  1.000         1st Qu.:0.0000                             
##  Median :  8.000         Median :1.0000                             
##  Mean   :  9.434         Mean   :0.9302                             
##  3rd Qu.: 14.600         3rd Qu.:2.0000                             
##  Max.   :100.000         Max.   :6.0000                             
##  nr. days with interval session total km Z3-4    max km Z3-4 one day
##  Min.   :0.000                  Min.   : 0.000   Min.   : 0.000     
##  1st Qu.:0.000                  1st Qu.: 0.000   1st Qu.: 0.000     
##  Median :2.000                  Median : 0.000   Median : 0.000     
##  Mean   :1.673                  Mean   : 4.859   Mean   : 3.457     
##  3rd Qu.:3.000                  3rd Qu.: 8.000   3rd Qu.: 6.300     
##  Max.   :7.000                  Max.   :79.800   Max.   :75.000     
##  total km Z5-T1-T2 max km Z5-T1-T2 one day total hours alternative training
##  Min.   : 0.000    Min.   : 0.000          Min.   : 0.000                  
##  1st Qu.: 0.000    1st Qu.: 0.000          1st Qu.: 0.000                  
##  Median : 1.500    Median : 1.200          Median : 0.000                  
##  Mean   : 4.064    Mean   : 2.725          Mean   : 1.149                  
##  3rd Qu.: 6.300    3rd Qu.: 5.000          3rd Qu.: 1.500                  
##  Max.   :80.000    Max.   :76.000          Max.   :52.500                  
##  nr. strength trainings  avg exertion     min exertion     max exertion   
##  Min.   :0.0000         Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000         1st Qu.:0.1400   1st Qu.:0.0900   1st Qu.:0.1600  
##  Median :1.0000         Median :0.3200   Median :0.1400   Median :0.5100  
##  Mean   :0.8156         Mean   :0.3199   Mean   :0.1887   Mean   :0.4706  
##  3rd Qu.:1.0000         3rd Qu.:0.4800   3rd Qu.:0.2700   3rd Qu.:0.7300  
##  Max.   :9.0000         Max.   :0.9800   Max.   :0.9800   Max.   :1.0000  
##  avg training success min training success max training success
##  Min.   :0.0000       Min.   :0.0000       Min.   :0.0000      
##  1st Qu.:0.0000       1st Qu.:0.0000       1st Qu.:0.0000      
##  Median :0.5900       Median :0.3700       Median :0.7300      
##  Mean   :0.4475       Mean   :0.3384       Mean   :0.5251      
##  3rd Qu.:0.7300       3rd Qu.:0.6100       3rd Qu.:0.8400      
##  Max.   :1.0000       Max.   :1.0000       Max.   :1.0000      
##   avg recovery     min recovery     max recovery    nr. sessions.1  
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.:0.1500   1st Qu.:0.1100   1st Qu.:0.1700   1st Qu.: 5.000  
##  Median :0.2200   Median :0.1600   Median :0.3100   Median : 6.000  
##  Mean   :0.2555   Mean   :0.1856   Mean   :0.3442   Mean   : 5.808  
##  3rd Qu.:0.3600   3rd Qu.:0.2500   3rd Qu.:0.5200   3rd Qu.: 7.000  
##  Max.   :0.9000   Max.   :0.9000   Max.   :1.0000   Max.   :14.000  
##  nr. rest days.1  total kms.1     max km one day.1 total km Z3-Z4-Z5-T1-T2.1
##  Min.   :0.000   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000          
##  1st Qu.:1.000   1st Qu.: 22.20   1st Qu.:  8.80   1st Qu.:  0.800          
##  Median :1.000   Median : 44.40   Median : 13.30   Median :  8.000          
##  Mean   :1.879   Mean   : 49.26   Mean   : 13.92   Mean   :  9.384          
##  3rd Qu.:3.000   3rd Qu.: 70.00   3rd Qu.: 18.30   3rd Qu.: 14.500          
##  Max.   :7.000   Max.   :235.00   Max.   :130.00   Max.   :106.200          
##  nr. tough sessions (effort in Z5, T1 or T2).1 nr. days with interval session.1
##  Min.   :0.0000                                Min.   :0.000                   
##  1st Qu.:0.0000                                1st Qu.:0.000                   
##  Median :1.0000                                Median :2.000                   
##  Mean   :0.9247                                Mean   :1.664                   
##  3rd Qu.:2.0000                                3rd Qu.:3.000                   
##  Max.   :6.0000                                Max.   :7.000                   
##  total km Z3-4.1  max km Z3-4 one day.1 total km Z5-T1-T2.1
##  Min.   : 0.000   Min.   : 0.000        Min.   : 0.000     
##  1st Qu.: 0.000   1st Qu.: 0.000        1st Qu.: 0.000     
##  Median : 0.000   Median : 0.000        Median : 1.400     
##  Mean   : 4.841   Mean   : 3.438        Mean   : 4.022     
##  3rd Qu.: 8.000   3rd Qu.: 6.300        3rd Qu.: 6.200     
##  Max.   :85.000   Max.   :75.000        Max.   :80.000     
##  max km Z5-T1-T2 one day.1 total hours alternative training.1
##  Min.   : 0.000            Min.   : 0.000                    
##  1st Qu.: 0.000            1st Qu.: 0.000                    
##  Median : 1.000            Median : 0.000                    
##  Mean   : 2.693            Mean   : 1.172                    
##  3rd Qu.: 5.000            3rd Qu.: 1.500                    
##  Max.   :76.000            Max.   :52.500                    
##  nr. strength trainings.1 avg exertion.1   min exertion.1   max exertion.1  
##  Min.   :0.0000           Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000           1st Qu.:0.1400   1st Qu.:0.0900   1st Qu.:0.1600  
##  Median :1.0000           Median :0.3200   Median :0.1400   Median :0.5100  
##  Mean   :0.8182           Mean   :0.3196   Mean   :0.1881   Mean   :0.4701  
##  3rd Qu.:1.0000           3rd Qu.:0.4800   3rd Qu.:0.2700   3rd Qu.:0.7300  
##  Max.   :9.0000           Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  avg training success.1 min training success.1 max training success.1
##  Min.   :0.0000         Min.   :0.0000         Min.   :0.0000        
##  1st Qu.:0.0000         1st Qu.:0.0000         1st Qu.:0.0000        
##  Median :0.5900         Median :0.3800         Median :0.7300        
##  Mean   :0.4481         Mean   :0.3398         Mean   :0.5254        
##  3rd Qu.:0.7300         3rd Qu.:0.6100         3rd Qu.:0.8400        
##  Max.   :1.0000         Max.   :1.0000         Max.   :1.0000        
##  avg recovery.1   min recovery.1   max recovery.1   nr. sessions.2  
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   : 0.000  
##  1st Qu.:0.1500   1st Qu.:0.1100   1st Qu.:0.1700   1st Qu.: 5.000  
##  Median :0.2200   Median :0.1600   Median :0.3100   Median : 6.000  
##  Mean   :0.2551   Mean   :0.1851   Mean   :0.3436   Mean   : 5.811  
##  3rd Qu.:0.3600   3rd Qu.:0.2500   3rd Qu.:0.5200   3rd Qu.: 7.000  
##  Max.   :0.9000   Max.   :0.9000   Max.   :1.0000   Max.   :14.000  
##  nr. rest days.2  total kms.2     max km one day.2 total km Z3-Z4-Z5-T1-T2.2
##  Min.   :0.000   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000          
##  1st Qu.:1.000   1st Qu.: 21.70   1st Qu.:  8.60   1st Qu.:  0.200          
##  Median :1.000   Median : 43.90   Median : 13.20   Median :  7.900          
##  Mean   :1.884   Mean   : 48.81   Mean   : 13.82   Mean   :  9.297          
##  3rd Qu.:3.000   3rd Qu.: 69.60   3rd Qu.: 18.20   3rd Qu.: 14.500          
##  Max.   :7.000   Max.   :235.00   Max.   :135.00   Max.   :150.000          
##  nr. tough sessions (effort in Z5, T1 or T2).2 nr. days with interval session.2
##  Min.   :0.0000                                Min.   :0.000                   
##  1st Qu.:0.0000                                1st Qu.:0.000                   
##  Median :1.0000                                Median :2.000                   
##  Mean   :0.9153                                Mean   :1.653                   
##  3rd Qu.:2.0000                                3rd Qu.:3.000                   
##  Max.   :6.0000                                Max.   :7.000                   
##  total km Z3-4.2  max km Z3-4 one day.2 total km Z5-T1-T2.2
##  Min.   : 0.000   Min.   : 0.000        Min.   : 0.000     
##  1st Qu.: 0.000   1st Qu.: 0.000        1st Qu.: 0.000     
##  Median : 0.000   Median : 0.000        Median : 1.000     
##  Mean   : 4.793   Mean   : 3.399        Mean   : 3.977     
##  3rd Qu.: 8.000   3rd Qu.: 6.200        3rd Qu.: 6.100     
##  Max.   :85.000   Max.   :75.000        Max.   :52.200     
##  max km Z5-T1-T2 one day.2 total hours alternative training.2
##  Min.   : 0.000            Min.   : 0.000                    
##  1st Qu.: 0.000            1st Qu.: 0.000                    
##  Median : 1.000            Median : 0.000                    
##  Mean   : 2.665            Mean   : 1.204                    
##  3rd Qu.: 4.900            3rd Qu.: 1.580                    
##  Max.   :30.000            Max.   :67.330                    
##  nr. strength trainings.2 avg exertion.2 min exertion.2   max exertion.2
##  Min.   :0.0000           Min.   :0.00   Min.   :0.0000   Min.   :0.00  
##  1st Qu.:0.0000           1st Qu.:0.14   1st Qu.:0.0900   1st Qu.:0.16  
##  Median :1.0000           Median :0.32   Median :0.1400   Median :0.51  
##  Mean   :0.8244           Mean   :0.32   Mean   :0.1878   Mean   :0.47  
##  3rd Qu.:1.0000           3rd Qu.:0.49   3rd Qu.:0.2700   3rd Qu.:0.73  
##  Max.   :9.0000           Max.   :0.98   Max.   :0.9800   Max.   :1.00  
##  avg training success.2 min training success.2 max training success.2
##  Min.   :0.0000         Min.   :0.0000         Min.   :0.0000        
##  1st Qu.:0.0000         1st Qu.:0.0000         1st Qu.:0.0000        
##  Median :0.5900         Median :0.3800         Median :0.7300        
##  Mean   :0.4481         Mean   :0.3396         Mean   :0.5257        
##  3rd Qu.:0.7300         3rd Qu.:0.6100         3rd Qu.:0.8400        
##  Max.   :1.0000         Max.   :1.0000         Max.   :1.0000        
##  avg recovery.2   min recovery.2   max recovery.2     Athlete ID   
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   : 0.00  
##  1st Qu.:0.1500   1st Qu.:0.1100   1st Qu.:0.1700   1st Qu.:20.00  
##  Median :0.2200   Median :0.1600   Median :0.3100   Median :34.00  
##  Mean   :0.2551   Mean   :0.1849   Mean   :0.3435   Mean   :34.54  
##  3rd Qu.:0.3600   3rd Qu.:0.2400   3rd Qu.:0.5200   3rd Qu.:50.00  
##  Max.   :0.9000   Max.   :0.9000   Max.   :1.0000   Max.   :73.00  
##      injury        rel total kms week 0_1 rel total kms week 0_2
##  Min.   :0.00000   Min.   :        0      Min.   :        0     
##  1st Qu.:0.00000   1st Qu.:        1      1st Qu.:        1     
##  Median :0.00000   Median :        1      Median :        1     
##  Mean   :0.01344   Mean   :   440863      Mean   :   901468     
##  3rd Qu.:0.00000   3rd Qu.:        1      3rd Qu.:        1     
##  Max.   :1.00000   Max.   :209600000      Max.   :217600000     
##  rel total kms week 1_2      Date     
##  Min.   :        0      Min.   :   0  
##  1st Qu.:        1      1st Qu.: 437  
##  Median :        1      Median :1254  
##  Mean   :   480362      Mean   :1228  
##  3rd Qu.:        1      3rd Qu.:1913  
##  Max.   :209600000      Max.   :2673
# distance sum over 3 weeks
# week 0-2 (Distance in week 0 and the average across 1 and 2)
ac_7_21 <- week$`total kms` / ((week$`total kms.1` + week$`total kms.2`) / 2)

# A zero denominator with a positive numerator gives Inf; treat it as missing.
ac_7_21[is.infinite(ac_7_21)] <- NA
week$ac_7_21 <- ac_7_21

# Density of the distance ratio per injury status, limited to ratios in [0, 3]
# (rows outside the limits or missing are removed by ggplot with a warning).
p <- week %>%
  ggplot(aes(x = ac_7_21)) +
  geom_density(aes(fill = factor(injury)), alpha = 0.3) +
  scale_fill_manual(values = c("red2", "green")) +
  labs(title = "Density Plot of Distance Ratio (ac_7_21) by Injury Status",
       x = "Total kms week 0 / mean total kms weeks 1-2",
       y = "Density", fill = "Injury Status") +
  xlim(0, 3)
p
## Warning: Removed 4726 rows containing non-finite values (`stat_density()`).

Derived Variables

# Ratio features: a current-week value divided by the mean of the same (or a
# related) measure in the ".1" and ".2" columns. Infinite results (non-zero
# value over a zero denominator) are recoded as NA.

# avg exertion vs mean avg exertion of .1 and .2
ae_7_21 <- week$`avg exertion` /
  ((week$`avg exertion.1` + week$`avg exertion.2`) / 2)
ae_7_21 <- replace(ae_7_21, is.infinite(ae_7_21), NA)

# max exertion vs mean avg recovery of .1 and .2
exer_v_rec_7_21 <- week$`max exertion` /
  ((week$`avg recovery.1` + week$`avg recovery.2`) / 2)
exer_v_rec_7_21 <- replace(exer_v_rec_7_21, is.infinite(exer_v_rec_7_21), NA)

# avg exertion vs mean avg recovery of .1 and .2
er_7_21 <- week$`avg exertion` /
  ((week$`avg recovery.1` + week$`avg recovery.2`) / 2)
er_7_21 <- replace(er_7_21, is.infinite(er_7_21), NA)

# avg recovery vs mean max exertion of .1 and .2
rec_maxexer_7_21 <- week$`avg recovery` /
  ((week$`max exertion.1` + week$`max exertion.2`) / 2)
rec_maxexer_7_21 <- replace(rec_maxexer_7_21, is.infinite(rec_maxexer_7_21), NA)




# Store the ratio and replace values above the threshold with the column mean.
week$ae_7_21 <- ae_7_21
threshold <- 2.25
# The column contains NAs, so mean() needs na.rm = TRUE; without it the mean
# is NA and every value above the threshold silently becomes NA instead.
week$ae_7_21 <- ifelse(week$ae_7_21 > threshold,
                       mean(week$ae_7_21, na.rm = TRUE),
                       week$ae_7_21)
summary(week$ae_7_21)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.8966  1.0000  0.9989  1.1034  2.2500    2857
# Store the ratio and replace values above the threshold with the column mean.
week$exer_v_rec_7_21 <- exer_v_rec_7_21
threshold <- 2.25
# The column contains NAs, so mean() needs na.rm = TRUE; without it the mean
# is NA and every value above the threshold silently becomes NA instead.
week$exer_v_rec_7_21 <- ifelse(week$exer_v_rec_7_21 > threshold,
                               mean(week$exer_v_rec_7_21, na.rm = TRUE),
                               week$exer_v_rec_7_21)
summary(week$exer_v_rec_7_21)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   1.032   1.308   1.313   1.636   2.250   13929
# Store the ratio and replace values above the threshold with the column mean.
week$er_7_21 <- er_7_21
threshold <- 2.25
# The column contains NAs, so mean() needs na.rm = TRUE; without it the mean
# is NA and every value above the threshold silently becomes NA instead.
week$er_7_21 <- ifelse(week$er_7_21 > threshold,
                       mean(week$er_7_21, na.rm = TRUE),
                       week$er_7_21)
summary(week$er_7_21)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   0.875   1.080   1.125   1.368   2.250    6143
# Store the ratio and replace values above the threshold with the column mean.
week$rec_maxexer_7_21 <- rec_maxexer_7_21
threshold <- 2.25
# The column contains NAs, so mean() needs na.rm = TRUE; without it the mean
# is NA and every value above the threshold silently becomes NA instead.
week$rec_maxexer_7_21 <- ifelse(week$rec_maxexer_7_21 > threshold,
                                mean(week$rec_maxexer_7_21, na.rm = TRUE),
                                week$rec_maxexer_7_21)
summary(week$rec_maxexer_7_21)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.4051  0.6282  0.6550  0.8485  2.2500    2681
# Modelling frame: current-week training measures, the injury flag and four
# derived ratios. Columns are selected by name rather than by position so the
# selection does not shift when columns are added to `week`. The order matches
# the previous positional selection c(1:3, 12:14, 16, 17, 20, 68, 73:76).
model_cols <- c(
  "nr. sessions", "nr. rest days", "total kms",
  "total hours alternative training", "nr. strength trainings",
  "avg exertion", "max exertion", "avg training success", "avg recovery",
  "injury",
  "ac_7_21", "ae_7_21", "exer_v_rec_7_21", "er_7_21"
)
model <- week[, model_cols]
model <- na.omit(model) # Drop rows with a missing value in any selected column

# Logistic regression of injury on all other selected columns.
full_model <- glm(injury ~ ., data = model, family = binomial())
summary(full_model)
## 
## Call:
## glm(formula = injury ~ ., family = binomial(), data = model)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.3103  -0.1743  -0.1444  -0.1230   3.5424  
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        -3.58411    0.57505  -6.233 4.59e-10 ***
## `nr. sessions`                     -0.05758    0.06873  -0.838 0.402170    
## `nr. rest days`                    -0.28656    0.08330  -3.440 0.000582 ***
## `total kms`                        -0.00437    0.00265  -1.649 0.099152 .  
## `total hours alternative training` -0.11027    0.04085  -2.699 0.006948 ** 
## `nr. strength trainings`            0.06048    0.06811   0.888 0.374496    
## `avg exertion`                      2.01601    2.43478   0.828 0.407667    
## `max exertion`                      0.55473    1.79439   0.309 0.757208    
## `avg training success`             -0.23085    0.31246  -0.739 0.460019    
## `avg recovery`                     -0.81802    0.77478  -1.056 0.291061    
## ac_7_21                            -0.03031    0.06297  -0.481 0.630315    
## ae_7_21                             0.07275    0.33287   0.219 0.827005    
## exer_v_rec_7_21                     0.23017    0.71383   0.322 0.747112    
## er_7_21                            -0.67293    0.95695  -0.703 0.481931    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3558.7  on 27553  degrees of freedom
## Residual deviance: 3478.9  on 27540  degrees of freedom
## AIC: 3506.9
## 
## Number of Fisher Scoring iterations: 7
# Forward selection has to start from the smallest model: started from
# full_model with the full model as upper scope there is nothing left to add,
# and the "selected" model is just full_model again.
null_model <- glm(injury ~ 1, data = model, family = binomial())
step_model <- stepAIC(null_model, direction = "forward",
                      scope = list(lower = ~1, upper = formula(full_model)),
                      trace = FALSE)
summary(step_model)
## 
## Call:
## glm(formula = injury ~ `nr. sessions` + `nr. rest days` + `total kms` + 
##     `total hours alternative training` + `nr. strength trainings` + 
##     `avg exertion` + `max exertion` + `avg training success` + 
##     `avg recovery` + ac_7_21 + ae_7_21 + exer_v_rec_7_21 + er_7_21, 
##     family = binomial(), data = model)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.3103  -0.1743  -0.1444  -0.1230   3.5424  
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        -3.58411    0.57505  -6.233 4.59e-10 ***
## `nr. sessions`                     -0.05758    0.06873  -0.838 0.402170    
## `nr. rest days`                    -0.28656    0.08330  -3.440 0.000582 ***
## `total kms`                        -0.00437    0.00265  -1.649 0.099152 .  
## `total hours alternative training` -0.11027    0.04085  -2.699 0.006948 ** 
## `nr. strength trainings`            0.06048    0.06811   0.888 0.374496    
## `avg exertion`                      2.01601    2.43478   0.828 0.407667    
## `max exertion`                      0.55473    1.79439   0.309 0.757208    
## `avg training success`             -0.23085    0.31246  -0.739 0.460019    
## `avg recovery`                     -0.81802    0.77478  -1.056 0.291061    
## ac_7_21                            -0.03031    0.06297  -0.481 0.630315    
## ae_7_21                             0.07275    0.33287   0.219 0.827005    
## exer_v_rec_7_21                     0.23017    0.71383   0.322 0.747112    
## er_7_21                            -0.67293    0.95695  -0.703 0.481931    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3558.7  on 27553  degrees of freedom
## Residual deviance: 3478.9  on 27540  degrees of freedom
## AIC: 3506.9
## 
## Number of Fisher Scoring iterations: 7

New Analysis

Let's investigate injuries as a part of our data set.

It looks as if injuries currently make up about 1.42 % of training days.

# Number of rows with at least one session, then the injury total divided by
# that number.
training_days <- sum(week[["nr. sessions"]] > 0)
sum(as.numeric(week[["injury"]])) / training_days
## [1] 0.01419087
summary(as.factor(week$injury))
##     0     1 
## 42223   575
# Injury total per athlete, sorted from most to fewest, first ten rows kept.
athlete <- head(
  arrange(
    summarise(group_by(week, `Athlete ID`),
              Injuries = sum(as.numeric(injury))),
    desc(Injuries)
  ),
  10
)
athlete
## # A tibble: 10 × 2
##    `Athlete ID` Injuries
##           <dbl>    <dbl>
##  1           26       35
##  2           29       33
##  3           42       32
##  4           23       24
##  5            9       22
##  6           27       22
##  7           36       22
##  8           22       21
##  9           38       18
## 10           41       18
# Density of the injury totals stored in `athlete`.
# NOTE(review): `athlete` holds only the ten rows kept by head(10) above, so
# this is not the distribution over all athletes - confirm that is intended.
Injury_Plot <- ggplot(data = athlete, mapping = aes(x = Injuries)) +
  geom_density(fill = "maroon", alpha = 0.5) +
  labs(
    x = "Total Injuries",
    title = "Distribution of Injuries among Athletes"
  )
Injury_Plot

Let's focus on the most injured athlete.

# Rows of the athlete with the highest injury total (ID 26). The ID column is
# numeric, so compare against a number instead of relying on coercion to text.
athlete1 <- week[week$`Athlete ID` == 26, ]

# Create plot
Athlete_Workloads <- ggplot(athlete1, # Set data
              aes(x = `Date`, y = `avg exertion`, color = factor(injury))) + # Set aesthetics
  geom_point(alpha = 0.5) + # Set geom_point for scatter plot
  labs(x = "Days",  # Set labels
       title = "Athlete 26 Injuries",
       color = "Injuries") + # Legend title belongs to the mapped colour aesthetic
  theme_bw() +
  theme(axis.line = element_line(colour = "black"), # Set axis line as black
        panel.grid.major = element_blank(), # Remove grid
        panel.grid.minor = element_blank(), # Remove grid
        panel.border = element_blank(), # Remove border
        panel.background = element_blank()) +  # Remove panel background
  scale_color_manual(values = c("0" = "blue", "1" = "red"), # Set color values
                     labels = c("0" = "Healthy", "1" = "Injured"))

# Generate graph with marginal histograms split by injury status
ggMarginal(Athlete_Workloads, groupFill = TRUE,
           type = "histogram")

# Columns that get stacked into name/value pairs
perceived_cols <- c("min recovery", "avg recovery", "max recovery",
                    "min exertion", "avg exertion", "max exertion",
                    "min training success", "avg training success",
                    "max training success")
# Columns kept as identifiers on every long-format row
perceived_id_cols <- c("Athlete ID", "injury")

# Long-format data for athlete 26 only
pivotdat <- pivot_longer(athlete1[, c(perceived_cols, perceived_id_cols)],
                         cols = all_of(perceived_cols))
# Long-format data for every athlete in `week`
pivotdatall <- pivot_longer(week[, c(perceived_cols, perceived_id_cols)],
                            cols = all_of(perceived_cols))
# Density of each pivoted metric for athlete 26, split by injury status,
# with one free-scaled facet per metric name
g <- ggplot(pivotdat,
              aes(x = value, fill = factor(injury))) + # Set aesthetics
  geom_density(alpha = 0.3) + # Set geom density for density plot
  labs(x = "Perceived Metric Values",  # Set labels
       title = "Injuries v Perceived Metric Values Leading Up to Injury",
       fill = "injury") +
  facet_wrap(~name, scales = "free") + # Spell out `scales`; do not rely on partial matching
  theme_bw() + # Set theme
  theme(axis.line = element_line(colour = "black"), # Set axis line as black
        panel.grid.major = element_blank(), # Remove major grid lines
        panel.grid.minor = element_blank(), # Remove minor grid lines
        panel.border = element_blank(), # Remove panel border
        panel.background = element_blank()) + # Remove panel background
  scale_fill_manual(values = c("0" = "blue", "1" = "red"), # Set color values
                    labels = c("0" = "Healthy", "1" = "Injury"))

# Generate graph
g

# Density of each pivoted metric across all rows of `week`, split by injury
# status, with one free-scaled facet per metric name
gall <- ggplot(pivotdatall,
              aes(x = value, fill = factor(injury))) + # Set aesthetics
  geom_density(alpha = 0.3) + # Set geom density for density plot
  labs(x = "Perceived Metric Values",  # Set labels
       title = "Injuries v Perceived Metric Values Leading Up to Injury",
       fill = "injury") +
  facet_wrap(~name, scales = "free") + # Spell out `scales`; do not rely on partial matching
  theme_bw() + # Set theme
  theme(axis.line = element_line(colour = "black"), # Set axis line as black
        panel.grid.major = element_blank(), # Remove major grid lines
        panel.grid.minor = element_blank(), # Remove minor grid lines
        panel.border = element_blank(), # Remove panel border
        panel.background = element_blank()) + # Remove panel background
  scale_fill_manual(values = c("0" = "blue", "1" = "red"), # Set color values
                    labels = c("0" = "Healthy", "1" = "Injury"))

# Generate graph
gall

Athlete ID 26 Week before injury

# Let's select the top 10 most injured athletes to see if there are any outstanding similarities
# NOTE(review): `athletes` is built here but the plot below uses `athlete1`
# (athlete 26 only) — confirm which data set was intended.
athletes <- week[week$`Athlete ID` %in% athlete$`Athlete ID`,]
# Scatter plot of max exertion against `avg recovery.1`, coloured by injury status
athlete_ex <- ggplot(athlete1, # Set data
              aes(x = `avg recovery.1`, y = `max exertion` , color = factor(injury))) + # Set aesthetics
  geom_point(alpha = 0.3) + # Set geom point for scatter plot
  labs(x = "Avg Recovery week before Injury",  # Set labels
       y = "Max Exertion week of Injury",
       title = "Max Exertion versus Average Recovery before injury",
       color = "injury") + # The legend comes from the colour aesthetic, so title it via `color`
  theme_bw() + # Set theme
  theme(axis.line = element_line(colour = "black"), # Set axis line as black
        panel.grid.major = element_blank(), # Remove major grid lines
        panel.grid.minor = element_blank(), # Remove minor grid lines
        panel.border = element_blank(), # Remove panel border
        panel.background = element_blank()) + # Remove panel background
  scale_color_manual(values = c("0" = "blue", "1" = "red"), # Set color values
                     labels = c("0" = "Healthy", "1" = "Injury"))

# Generate graph with per-group marginal plots
ggMarginal(athlete_ex, groupFill = TRUE)

# Scatter plot of max exertion against `avg recovery.1` for every row of
# `week`, coloured by injury status
week_ex <- ggplot(week, # Set data
              aes(x = `avg recovery.1`, y = `max exertion`, color = factor(injury))) + # Set aesthetics
  geom_point(alpha = 0.3) + # Set geom point for scatter plot
  labs(x = "Avg Recovery week before Injury",  # Set labels
       y = "Max Exertion week of Injury",
       title = "Max Exertion versus Average Recovery before injury",
       color = "injury") + # The legend comes from the colour aesthetic, so title it via `color`
  theme_bw() + # Set theme
  theme(axis.line = element_line(colour = "black"), # Set axis line as black
        panel.grid.major = element_blank(), # Remove major grid lines
        panel.grid.minor = element_blank(), # Remove minor grid lines
        panel.border = element_blank(), # Remove panel border
        panel.background = element_blank()) + # Remove panel background
  scale_color_manual(values = c("0" = "blue", "1" = "red"), # Set color values
                     labels = c("0" = "Healthy", "1" = "Injury"))
# Generate graph with per-group marginal plots
ggMarginal(week_ex, groupFill = TRUE)

set.seed(42069) # Fix the RNG so the partition below is reproducible
# 75/25 split of `week` into training and test rows.
# NOTE(review): `injury` is passed as a numeric vector; caret bins numeric `y`
# by percentiles rather than by class, so confirm whether a class-stratified
# split (factor `y`) was intended.
trainIndex <- createDataPartition(y = week$injury, p = 0.75, list = FALSE)
trainData <- week[trainIndex, ]
testData <- week[-trainIndex, ]
# Rows of `week` flagged as injured
validData <- week[week$injury==1,]
# c(1:4,6, 12:22, 73:76)
# Column positions used as model features for both train and test matrices
feature_cols_1 <- c(1:4, 6, 12:22, 36:44, 58:66, 74:77)
# Training matrix with the injury flag as label
dtrain_1 <- xgb.DMatrix(data = as.matrix(trainData[, feature_cols_1]),
                        label = trainData$injury)
# Test matrix with the injury flag as label
dtest_1 <- xgb.DMatrix(data = as.matrix(testData[, feature_cols_1]),
                       label = testData$injury)
# Fit a binary-logistic booster, reporting AUC and error on the training data
fit_1 <- xgboost(
  dtrain_1, # Training data
  nrounds = 200, # Number of boosting rounds
  eta = 0.05, # Learning rate
  verbose = 1, # 1 - prints out fit
  print_every_n = 20, # Print the metrics every 20th iteration
  objective = "binary:logistic", # Binary classification objective
  eval_metric = "auc",
  eval_metric = "error"
)
## [1]  train-auc:0.611286  train-error:0.013303 
## [21] train-auc:0.627602  train-error:0.013178 
## [41] train-auc:0.703593  train-error:0.013209 
## [61] train-auc:0.799753  train-error:0.013240 
## [81] train-auc:0.866385  train-error:0.013271 
## [101]    train-auc:0.917969  train-error:0.013240 
## [121]    train-auc:0.939627  train-error:0.013085 
## [141]    train-auc:0.949698  train-error:0.012929 
## [161]    train-auc:0.960221  train-error:0.012742 
## [181]    train-auc:0.967799  train-error:0.012493 
## [200]    train-auc:0.974267  train-error:0.012430
# Score the held-out test matrix with the fitted model
preds_injury <- predict(fit_1, newdata = dtest_1)
# ROC curve of the test labels against the predicted scores
roc1 <- roc(response = testData$injury, predictor = preds_injury)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Draw the ROC curve in red with its AUC printed on the plot
plot.roc(roc1, col = "red", print.auc = TRUE, print.auc.col = "red")

# Cutoff above which a predicted probability is labelled as class 1
injury_cutoff <- 0.0145

# Start with every prediction labelled 0, then flag those above the cutoff
pred_class <- numeric(length(preds_injury))
pred_class[preds_injury > injury_cutoff] <- 1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7946   61
##          1 2605   87
##                                          
##                Accuracy : 0.7508         
##                  95% CI : (0.7425, 0.759)
##     No Information Rate : 0.9862         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.036          
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.75310        
##             Specificity : 0.58784        
##          Pos Pred Value : 0.99238        
##          Neg Pred Value : 0.03232        
##              Prevalence : 0.98617        
##          Detection Rate : 0.74269        
##    Detection Prevalence : 0.74839        
##       Balanced Accuracy : 0.67047        
##                                          
##        'Positive' Class : 0              
## 
# Score the features of fit_1 on the training feature matrix using the SHAP
# helper sourced from a_insights_shap_functions.r.
# NOTE(review): shap_approx = FALSE presumably requests exact (non-approximate)
# SHAP values — confirm against the helper's definition.
shap_result_1 <- shap.score.rank(
  xgb_model = fit_1,
  X_train = as.matrix(trainData[, c(1:4, 6, 12:22, 36:44, 58:66, 74:77)]),
  shap_approx = FALSE # Use FALSE rather than the reassignable alias `F`
)
## Loading required package: data.table
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## make SHAP score by decreasing order
# Prepare the SHAP results for plotting with the sourced shap.prep helper
# (top_n = 8; presumably keeps the eight highest-ranked features — confirm)
shap_long_1 <- shap.prep(
  shap = shap_result_1,
  X_train = as.matrix(trainData[, c(1:4, 6, 12:22, 36:44, 58:66, 74:77)]),
  top_n = 8
)
## Loading required package: ggforce
# Draw the SHAP summary plot from the prepared long-format data
plot.shap.summary(data_long = shap_long_1)

Future Steps: We could redo the study to focus on the injuries of the top 10 most injured runners, to see whether they share any significant metrics that lead to injury.

# Keep the rows of `week` whose athlete ID appears in the `athlete` table
athletes <- week[week[["Athlete ID"]] %in% athlete[["Athlete ID"]], ]